{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Monday: The doctor's appointment is at 2:45pm.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Tuesday: The dentist's appointment is at 11:30...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Wednesday: At 7:00pm, there is a basketball game!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Thursday: Be back home by 11:15 pm at the latest.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Friday: Take the train at 08:10 am, arrive at ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text\n",
       "0     Monday: The doctor's appointment is at 2:45pm.\n",
       "1  Tuesday: The dentist's appointment is at 11:30...\n",
       "2  Wednesday: At 7:00pm, there is a basketball game!\n",
       "3  Thursday: Be back home by 11:15 pm at the latest.\n",
       "4  Friday: Take the train at 08:10 am, arrive at ..."
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "time_sentences = [\"Monday: The doctor's appointment is at 2:45pm.\", \n",
    "                  \"Tuesday: The dentist's appointment is at 11:30 am.\",\n",
    "                  \"Wednesday: At 7:00pm, there is a basketball game!\",\n",
    "                  \"Thursday: Be back home by 11:15 pm at the latest.\",\n",
    "                  \"Friday: Take the train at 08:10 am, arrive at 09:00am.\"]\n",
    "df = pd.DataFrame(time_sentences, columns = ['text'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    46\n",
       "1    50\n",
       "2    49\n",
       "3    49\n",
       "4    54\n",
       "Name: text, dtype: int64"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# find the number of characters for each string in df['text']\n",
    "df['text'].str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     7\n",
       "1     8\n",
       "2     8\n",
       "3    10\n",
       "4    10\n",
       "Name: text, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# find the number of tokens for each string in df['text']\n",
    "df['text'].str.split().str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     True\n",
       "1     True\n",
       "2    False\n",
       "3    False\n",
       "4    False\n",
       "Name: text, dtype: bool"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# find which entries contain the word 'appointment'\n",
    "df['text'].str.contains('appointment')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    3\n",
       "1    4\n",
       "2    3\n",
       "3    4\n",
       "4    8\n",
       "Name: text, dtype: int64"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# find how many times a digit occurs in each string\n",
    "df['text'].str.count(r'\\d')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                   [2, 4, 5]\n",
       "1                [1, 1, 3, 0]\n",
       "2                   [7, 0, 0]\n",
       "3                [1, 1, 1, 5]\n",
       "4    [0, 8, 1, 0, 0, 9, 0, 0]\n",
       "Name: text, dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# find all occurances of the digits\n",
    "df['text'].str.findall(r'\\d')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0               [(2, 45)]\n",
       "1              [(11, 30)]\n",
       "2               [(7, 00)]\n",
       "3              [(11, 15)]\n",
       "4    [(08, 10), (09, 00)]\n",
       "Name: text, dtype: object"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# group and find the hours and minutes\n",
    "df['text'].str.findall(r'(\\d?\\d):(\\d\\d)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0          ???: The doctor's appointment is at 2:45pm.\n",
       "1       ???: The dentist's appointment is at 11:30 am.\n",
       "2          ???: At 7:00pm, there is a basketball game!\n",
       "3         ???: Be back home by 11:15 pm at the latest.\n",
       "4    ???: Take the train at 08:10 am, arrive at 09:...\n",
       "Name: text, dtype: object"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['text'].str.replace(r'\\w+day\\b', '???')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0          Mon: The doctor's appointment is at 2:45pm.\n",
       "1       Tue: The dentist's appointment is at 11:30 am.\n",
       "2          Wed: At 7:00pm, there is a basketball game!\n",
       "3         Thu: Be back home by 11:15 pm at the latest.\n",
       "4    Fri: Take the train at 08:10 am, arrive at 09:...\n",
       "Name: text, dtype: object"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['text'].str.replace(r'(\\w+day\\b)', lambda x: x.groups()[0][:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Susan\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)\n",
      "  from ipykernel import kernelapp as app\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7</td>\n",
       "      <td>00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>08</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    0   1\n",
       "0   2  45\n",
       "1  11  30\n",
       "2   7  00\n",
       "3  11  15\n",
       "4  08  10"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create new columns from first match of extracted groups\n",
    "df['text'].str.extract(r'(\\d?\\d):(\\d\\d)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>match</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <th>0</th>\n",
       "      <td>2:45pm</td>\n",
       "      <td>2</td>\n",
       "      <td>45</td>\n",
       "      <td>pm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <th>0</th>\n",
       "      <td>11:30 am</td>\n",
       "      <td>11</td>\n",
       "      <td>30</td>\n",
       "      <td>am</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>0</th>\n",
       "      <td>7:00pm</td>\n",
       "      <td>7</td>\n",
       "      <td>00</td>\n",
       "      <td>pm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <th>0</th>\n",
       "      <td>11:15 pm</td>\n",
       "      <td>11</td>\n",
       "      <td>15</td>\n",
       "      <td>pm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">4</th>\n",
       "      <th>0</th>\n",
       "      <td>08:10 am</td>\n",
       "      <td>08</td>\n",
       "      <td>10</td>\n",
       "      <td>am</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>09:00am</td>\n",
       "      <td>09</td>\n",
       "      <td>00</td>\n",
       "      <td>am</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                0   1   2   3\n",
       "  match                      \n",
       "0 0        2:45pm   2  45  pm\n",
       "1 0      11:30 am  11  30  am\n",
       "2 0        7:00pm   7  00  pm\n",
       "3 0      11:15 pm  11  15  pm\n",
       "4 0      08:10 am  08  10  am\n",
       "  1       09:00am  09  00  am"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# extract the entire time, the hours, the minutes, and the period\n",
    "df['text'].str.extractall(r'((\\d?\\d):(\\d\\d) ?([ap]m))')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>time</th>\n",
       "      <th>hour</th>\n",
       "      <th>minute</th>\n",
       "      <th>period</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>match</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <th>0</th>\n",
       "      <td>2:45pm</td>\n",
       "      <td>2</td>\n",
       "      <td>45</td>\n",
       "      <td>pm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <th>0</th>\n",
       "      <td>11:30 am</td>\n",
       "      <td>11</td>\n",
       "      <td>30</td>\n",
       "      <td>am</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <th>0</th>\n",
       "      <td>7:00pm</td>\n",
       "      <td>7</td>\n",
       "      <td>00</td>\n",
       "      <td>pm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <th>0</th>\n",
       "      <td>11:15 pm</td>\n",
       "      <td>11</td>\n",
       "      <td>15</td>\n",
       "      <td>pm</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">4</th>\n",
       "      <th>0</th>\n",
       "      <td>08:10 am</td>\n",
       "      <td>08</td>\n",
       "      <td>10</td>\n",
       "      <td>am</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>09:00am</td>\n",
       "      <td>09</td>\n",
       "      <td>00</td>\n",
       "      <td>am</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             time hour minute period\n",
       "  match                             \n",
       "0 0        2:45pm    2     45     pm\n",
       "1 0      11:30 am   11     30     am\n",
       "2 0        7:00pm    7     00     pm\n",
       "3 0      11:15 pm   11     15     pm\n",
       "4 0      08:10 am   08     10     am\n",
       "  1       09:00am   09     00     am"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['text'].str.extractall(r'(?P<time>(?P<hour>\\d?\\d):(?P<minute>\\d\\d) ?(?P<period>[ap]m))')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
